#!/bin/bash

# Stop at the first failing command, and make a pipeline fail when any of
# its stages fails (not only the last one).
set -eo pipefail

# Root of the data tree and the dated snapshot the input paths are built from.
BASE_DATA_PATH=/home/mixagol/data
BASE_DATE=20120802
# Byte-wise collation, so that every tool started from this script
# (sort, join, uniq, ...) orders lines the same way.
LC_ALL=C
export BASE_DATA_PATH BASE_DATE LC_ALL

#######################################
# Join the genome list with the per-organism article table.
#
# Arguments:
#   $1 - tab-separated genomes file; only column 1 (the join key) is used
#   $2 - tab-separated articles file; column 2 is the join key and
#        column 4 is the value carried into the output
# Outputs:
#   stdout: "<key>\t<column 4>" for every key present in both files.
#   Article rows whose key or value contains the text "None" are dropped.
# Returns:
#   non-zero (with a message on stderr) if either input is not readable.
#######################################
join_genome_articles() {
    local genomes_file=$1
    local articles_file=$2
    local input

    # The inputs are read inside process substitutions, whose failures are
    # invisible to errexit/pipefail: without this check a missing file would
    # silently produce an empty result.
    for input in "${genomes_file}" "${articles_file}"; do
        if [[ ! -r "${input}" ]]; then
            printf 'error: cannot read input file: %s\n' "${input}" >&2
            return 1
        fi
    done

    # join needs both sides sorted on the key under the same collation.
    join -t$'\t' \
        <(cut -f1 "${genomes_file}" | sort -t$'\t' -k1,1) \
        <(cut -f2,4 "${articles_file}" | grep -v None | sort -t$'\t' -k1,1)
}

# NOTE(review): this produces two columns, while later steps of this script
# read a third column from the same file - presumably added by hand
# afterwards. Rerunning this step overwrites the file; confirm before rerun.
join_genome_articles \
    "${BASE_DATA_PATH}/0_src/${BASE_DATE}/complete_genomes_uniq_l0.txt" \
    "${BASE_DATA_PATH}/7_aux/kegg/${BASE_DATE}/orgs_articles_full.txt" \
    > 8_other_systems/genome_free_articles.txt

# Frequency table (highest count first) of the value picked by the two cut
# stages below.
# NOTE(review): `cut -f1,3 | cut -f2` yields column 3 only when a line has at
# least three columns - confirm the file has them at this point.
cut -f1,3 8_other_systems/genome_free_articles.txt \
    | cut -f2 \
    | sort \
    | uniq -c \
    | sort -nr

# + the annotation methods were manually coarsened into groups
# + the 3 genomes that were not annotated were thrown out

#######################################
# Reduce the curated table to "<column 1>\t<column 3>" and drop unusable rows.
#
# Inputs:
#   stdin: tab-separated table; column 1 and column 3 are used
# Outputs:
#   stdout: "<column 1>\t<column 3>" for every row that survives the filters.
#   A row is dropped when its output line contains "None", "?", "Duplicate",
#   or one of the three excluded genome ids.
#######################################
filter_grouped_genomes() {
    # -F: "?" is meant literally.
    # -E: the alternation needs extended regex syntax; in a basic regex "|" is
    #     an ordinary character, so the exclusion list previously matched
    #     nothing. "N_009440" was also corrected to "NC_009440".
    # NOTE(review): tallies recorded elsewhere in this file may predate this
    # fix (i.e. still include the three genomes) - re-check after rerunning.
    awk -F'\t' '{print $1"\t"$3}' \
        | grep -v None \
        | grep -vF '?' \
        | grep -v 'Duplicate' \
        | grep -Ev 'NC_008525|NC_009440|NC_015520'
}

filter_grouped_genomes \
    < 8_other_systems/genome_free_articles.txt \
    > 8_other_systems/genome_free_articles_grp.txt

# Count the rows per value of column 2 (the method group), most frequent first.
cut -f2 8_other_systems/genome_free_articles_grp.txt \
    | sort \
    | uniq -c \
    | sort -nr
# Recorded output (presumably from an earlier run):
#     39 NCBI, UniProt, TIGRFam, Pfam, PRIAM, KEGG, COG, and InterPro, IMG-ER
#     29 BLAST, similarity, homology
#      8 GenDB, BLAST, COG, COGnitor
#      5 InterPro(Scan)
#      2 HMM, FASTA
#      2 BLASTP, HMM
#      2 BLAST, IMG, JGI
#      1 TIGRFam, KEGG, COG

# Print the rows (columns 1-2) of the grouped table that match pattern $1.
# The pattern is handed to grep as-is (basic regular expression).
select_method_group() {
    cut -f1,2 8_other_systems/genome_free_articles_grp.txt | grep "$1"
}

# One output file per method group; these are the four groups with the
# highest counts in the tally recorded in this script.
select_method_group "NCBI, UniProt, TIGRFam, Pfam, PRIAM, KEGG, COG, and InterPro, IMG-ER" \
    > 8_other_systems/genomes_grp1.txt

select_method_group "BLAST, similarity, homology" \
    > 8_other_systems/genomes_grp2.txt

select_method_group "GenDB, BLAST, COG, COGnitor" \
    > 8_other_systems/genomes_grp3.txt

select_method_group "InterPro(Scan)" \
    > 8_other_systems/genomes_grp4.txt

# Put every row of the grouped table into one catch-all group: keep column 1
# and replace the group column with the fixed label "All".
awk -F'\t' '{ printf "%s\tAll\n", $1 }' \
    8_other_systems/genome_free_articles_grp.txt \
    > 8_other_systems/genomes_grp_all.txt



